This is a personal R Markdown document I created to visualize COVID-19 updates and to perform some preliminary exploratory data analysis (EDA). The data come from the GitHub repository behind the Coronavirus COVID-19 Global Cases dashboard, which is created and maintained by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University (JHU).
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(forecast))
suppressPackageStartupMessages(library(zoo))
suppressPackageStartupMessages(library(xts))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(gghighlight))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(directlabels))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(plotly))
#suppressPackageStartupMessages(library(rjson))
# Load the three global time-series tables (confirmed, deaths, recovered).
# The paths are relative, so the working directory must contain the
# `csse_covid_19_data` folder (presumably a checkout of the JHU CSSE
# repository -- confirm).
COVID_confirmed_global_raw <- read_csv("csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv")
COVID_deaths_global_raw <- read_csv("csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv")
COVID_recovered_global_raw <- read_csv("csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv")
# Wide -> long: every column from `1/22/20` through the last column of each
# raw table becomes one (date, n_cases) row; the remaining columns are kept.
COVID_confirmed_global_longer <- pivot_longer(
  COVID_confirmed_global_raw,
  cols = `1/22/20`:last_col(),
  names_to = "date",
  values_to = "n_cases"
)
COVID_deaths_global_longer <- pivot_longer(
  COVID_deaths_global_raw,
  cols = `1/22/20`:last_col(),
  names_to = "date",
  values_to = "n_cases"
)
COVID_recovered_global_longer <- pivot_longer(
  COVID_recovered_global_raw,
  cols = `1/22/20`:last_col(),
  names_to = "date",
  values_to = "n_cases"
)
# Give the three long tables the same short column names
# (six columns each, assigned positionally).
COVID_confirmed_global_longer <- stats::setNames(
  COVID_confirmed_global_longer,
  c('state', 'country', 'lat', 'long', 'date', 'n_cases')
)
COVID_deaths_global_longer <- stats::setNames(
  COVID_deaths_global_longer,
  c('state', 'country', 'lat', 'long', 'date', 'n_cases')
)
COVID_recovered_global_longer <- stats::setNames(
  COVID_recovered_global_longer,
  c('state', 'country', 'lat', 'long', 'date', 'n_cases')
)
# Collapse states/provinces: keep one row per country and date whose count is
# the sum over that country's rows. `state`, `lat` and `long` do not survive
# the summary. dplyr drops only the last grouping level here, so each result
# remains grouped by `country`.
COVID_confirmed_global_longer <- COVID_confirmed_global_longer %>%
  select(country, lat, long, date, n_cases) %>%
  group_by(country, date) %>%
  summarise(n_cases = sum(n_cases))
COVID_deaths_global_longer <- COVID_deaths_global_longer %>%
  select(country, lat, long, date, n_cases) %>%
  group_by(country, date) %>%
  summarise(n_cases = sum(n_cases))
COVID_recovered_global_longer <- COVID_recovered_global_longer %>%
  select(country, lat, long, date, n_cases) %>%
  group_by(country, date) %>%
  summarise(n_cases = sum(n_cases))
# Convert the date columns from character to Date.
# The source headers use a two-digit year (e.g. "1/22/20"), so the format must
# be `%y`; `%Y` would read "20" as the literal year 0020 and every date would
# come out as 0020-01-22, 0020-01-23, ...
COVID_confirmed_global_longer$date <- as.Date(COVID_confirmed_global_longer$date, format = '%m/%d/%y')
COVID_deaths_global_longer$date <- as.Date(COVID_deaths_global_longer$date, format = '%m/%d/%y')
COVID_recovered_global_longer$date <- as.Date(COVID_recovered_global_longer$date, format = '%m/%d/%y')
# Daily increase per country: order each table chronologically within
# country, then subtract the previous day's cumulative count. The first day of
# each country has no predecessor, so 0 is used and `new_cases` equals
# `n_cases` there. The explicit group_by() restates the grouping the tables
# already carry, so the lag never crosses from one country into the next.
COVID_confirmed_global_longer <- COVID_confirmed_global_longer %>%
  group_by(country) %>%
  arrange(country, date) %>%
  mutate(new_cases = n_cases - dplyr::lag(n_cases, n = 1, default = 0))
COVID_deaths_global_longer <- COVID_deaths_global_longer %>%
  group_by(country) %>%
  arrange(country, date) %>%
  mutate(new_cases = n_cases - dplyr::lag(n_cases, n = 1, default = 0))
COVID_recovered_global_longer <- COVID_recovered_global_longer %>%
  group_by(country) %>%
  arrange(country, date) %>%
  mutate(new_cases = n_cases - dplyr::lag(n_cases, n = 1, default = 0))
Let’s look at the current data format
# Show the first rows of the confirmed-cases table as a markdown table.
COVID_confirmed_global_longer %>%
  head() %>%
  knitr::kable(format = 'markdown')
| country | date | n_cases | new_cases |
|---|---|---|---|
| Afghanistan | 0020-01-22 | 0 | 0 |
| Afghanistan | 0020-01-23 | 0 | 0 |
| Afghanistan | 0020-01-24 | 0 | 0 |
| Afghanistan | 0020-01-25 | 0 | 0 |
| Afghanistan | 0020-01-26 | 0 | 0 |
| Afghanistan | 0020-01-27 | 0 | 0 |
# Print worldwide totals for the confirmed, deaths and recovered series.
#
# Reads the global tables COVID_confirmed_global_longer,
# COVID_deaths_global_longer and COVID_recovered_global_longer (columns
# `country`, `date`, `n_cases`, `new_cases`). Takes no arguments.
# Prints three lines; like any print() call, the last printed string is
# returned invisibly.
world_summary <- function() {
  # One-row summary of a series: the sum over countries of each country's
  # highest cumulative count and of its most recent daily increase.
  summarise_world <- function(df) {
    df %>%
      group_by(country) %>%
      summarize(n_cases_today = max(n_cases),
                # order_by makes "most recent" independent of the row order
                new_cases_today = dplyr::last(new_cases, order_by = date)) %>%
      ungroup() %>%
      summarize(n_cases_total = sum(n_cases_today),
                new_cases_total = sum(new_cases_today))
  }

  df1 <- summarise_world(COVID_confirmed_global_longer)
  df2 <- summarise_world(COVID_deaths_global_longer)
  df3 <- summarise_world(COVID_recovered_global_longer)

  print(paste0("number of total confirmed cases in the world as of today: ", df1$n_cases_total, " with ", df1$new_cases_total, " new cases"))
  print(paste0("number of total deaths in the world as of today: ", df2$n_cases_total, " with ", df2$new_cases_total, " new deaths"))
  print(paste0("number of total recovered cases in the world as of today: ", df3$n_cases_total, " with ", df3$new_cases_total, " new cases"))
}
# Print the confirmed / deaths / recovered figures for a single country.
#
# country1  country name, matched exactly (case-sensitive) against the
#           `country` column of the three global long tables.
#
# Reads the global tables COVID_confirmed_global_longer,
# COVID_deaths_global_longer and COVID_recovered_global_longer.
# Prints three lines; the last printed string is returned invisibly.
# If a table has no rows for `country1`, a warning is raised and NA is
# reported for that series instead of printing a message with blank numbers.
country_summary <- function(country1) {
  # Highest cumulative count and most recent daily increase for `country1`.
  summarise_country <- function(df, series) {
    rows <- df %>% dplyr::filter(country == country1)
    if (nrow(rows) == 0) {
      warning("no ", series, " data found for country '", country1, "'",
              call. = FALSE)
      return(data.frame(n_cases_today = NA, new_cases_today = NA))
    }
    rows %>%
      group_by(country) %>%
      summarize(n_cases_today = max(n_cases),
                # order_by makes "most recent" independent of the row order
                new_cases_today = dplyr::last(new_cases, order_by = date))
  }

  df1 <- summarise_country(COVID_confirmed_global_longer, "confirmed")
  df2 <- summarise_country(COVID_deaths_global_longer, "deaths")
  df3 <- summarise_country(COVID_recovered_global_longer, "recovered")

  print(paste0("number of confirmed cases in ", country1, " as of today: ", df1$n_cases_today, " with ", df1$new_cases_today, " new cases"))
  print(paste0("number of deaths in ", country1, " as of today: ", df2$n_cases_today, " with ", df2$new_cases_today, " new deaths"))
  print(paste0("number of recovered cases in ", country1, " as of today: ", df3$n_cases_today, " with ", df3$new_cases_today, " new cases"))
}
# Worldwide totals first, then one summary per country of interest.
# The `## [1] ...` lines appear to be the output recorded when the document
# was rendered; they are a snapshot and will differ for newer data.
world_summary()
## [1] "number of total confirmed cases in the world as of today: 1917333 with 70640 new cases"
## [1] "number of total deaths in the world as of today: 119482 with 5391 new deaths"
## [1] "number of total recovered cases in the world as of today: 449038 with 26933 new cases"
country_summary("US")
## [1] "number of confirmed cases in US as of today: 580619 with 25306 new cases"
## [1] "number of deaths in US as of today: 23529 with 1509 new deaths"
## [1] "number of recovered cases in US as of today: 43482 with 10494 new cases"
country_summary("Italy")
## [1] "number of confirmed cases in Italy as of today: 159516 with 3153 new cases"
## [1] "number of deaths in Italy as of today: 20465 with 566 new deaths"
## [1] "number of recovered cases in Italy as of today: 35435 with 1224 new cases"
country_summary("Spain")
## [1] "number of confirmed cases in Spain as of today: 170099 with 3268 new cases"
## [1] "number of deaths in Spain as of today: 17756 with 547 new deaths"
## [1] "number of recovered cases in Spain as of today: 64727 with 2336 new cases"
country_summary("China")
## [1] "number of confirmed cases in China as of today: 83213 with 79 new cases"
## [1] "number of deaths in China as of today: 3345 with 2 new deaths"
## [1] "number of recovered cases in China as of today: 78039 with 83 new cases"
country_summary("Egypt")
## [1] "number of confirmed cases in Egypt as of today: 2190 with 125 new cases"
## [1] "number of deaths in Egypt as of today: 164 with 5 new deaths"
## [1] "number of recovered cases in Egypt as of today: 589 with 0 new cases"
country_summary("Germany")
## [1] "number of confirmed cases in Germany as of today: 130072 with 2218 new cases"
## [1] "number of deaths in Germany as of today: 3194 with 172 new deaths"
## [1] "number of recovered cases in Germany as of today: 64300 with 4000 new cases"
country_summary("France")
## [1] "number of confirmed cases in France as of today: 137875 with 4205 new cases"
## [1] "number of deaths in France as of today: 14986 with 574 new deaths"
## [1] "number of recovered cases in France as of today: 28001 with 532 new cases"
# Donut chart of confirmed cases per country; countries with at most 5000
# cases are lumped into a single "other" slice.
# The threshold has to be applied to each country's total, not to individual
# daily rows: otherwise the early rows of large countries fall into "other",
# and taking max() over that bucket yields one row's value (just under 5000)
# instead of the combined total of the small countries.
df <- COVID_confirmed_global_longer %>%
  group_by(country) %>%
  summarize(count = max(n_cases)) %>%
  ungroup() %>%
  mutate(country_sum = ifelse(count > 5000, country, "other")) %>%
  group_by(country_sum) %>%
  summarize(count = sum(count))
fig <- df %>% plot_ly(labels = ~country_sum, values = ~count, text = ~country_sum)
fig <- fig %>% add_pie(hole = 0.4)
fig <- fig %>% layout(title = "Confirmed cases worldwide", showlegend = FALSE,
                      xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
                      yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
fig
# Stacked bar chart of cumulative confirmed cases over time, one colour per
# country, with the country name as bar text and no legend.
plotly::layout(
  add_bars(
    plot_ly(
      group_by(COVID_confirmed_global_longer, country),
      x = ~date,
      y = ~n_cases,
      color = ~country
    ),
    text = ~country
  ),
  barmode = "stack",
  showlegend = FALSE
)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Line plot of one series for a set of countries, on a log10 y-axis with the
# country name printed at both ends of each line.
#
# df           long table with columns `country`, `date`, `n_cases` and
#              `new_cases`
# curve_title  plot title
# cumulative   TRUE plots `n_cases`; FALSE plots `new_cases`
# ...          country names (strings or character vectors), matched exactly
#              and case-sensitively against `df$country`
#
# Returns the ggplot object. Names in `...` that match no row of `df` raise a
# warning instead of being dropped silently.
plot_countries <- function(df, curve_title, cumulative=TRUE, ...) {
  wanted <- as.character(c(...))
  unknown <- base::setdiff(wanted, unique(df$country))
  if (length(unknown) > 0) {
    warning("no rows found for country name(s): ",
            paste(unknown, collapse = ", "), call. = FALSE)
  }

  df1 <- df %>%
    dplyr::filter(country %in% wanted)

  # The two variants differ only in which column goes on the y-axis.
  mapping <- if (cumulative) {
    aes(date, n_cases, group = country, color = country)
  } else {
    aes(date, new_cases, group = country, color = country)
  }

  p1 <- ggplot(df1, mapping) +
    geom_line() +
    scale_x_date(date_breaks = "3 days") +
    scale_y_log10(labels = function(x) format(x, scientific = FALSE),
                  name = "number of cases",
                  breaks = scales::breaks_log(n = 10)) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90), legend.position = "none") +
    ggtitle(curve_title) +
    geom_dl(data = df1, aes(label = country),
            method = list(dl.combine("first.points", "last.points"), cex = 0.8))
  return(p1)
}
# Country names are matched case-sensitively and the data spell it "China"
# (see country_summary("China")), so the lowercase "china" never matched and
# China was missing from every plot.
plot_countries(COVID_confirmed_global_longer, curve_title = "Confirmed cases (cumulative)", cumulative = TRUE, "US", "Italy", "Canada", "Egypt", "China")
plot_countries(COVID_deaths_global_longer, curve_title = "Death cases (cumulative)", cumulative = TRUE, "US", "Italy", "Canada", "Egypt", "China")
plot_countries(COVID_recovered_global_longer, curve_title = "Recovered cases (cumulative)", cumulative = TRUE, "US", "Italy", "Canada", "Egypt", "China")
plot_countries(COVID_confirmed_global_longer, curve_title = "New confirmed cases", cumulative = FALSE, "US", "Italy", "Canada", "Egypt", "China")
plot_countries(COVID_deaths_global_longer, curve_title = "New death cases", cumulative = FALSE, "US", "Italy", "Canada", "Egypt", "China")
plot_countries(COVID_recovered_global_longer, curve_title = "New recovered cases", cumulative = FALSE, "US", "Italy", "Canada", "Egypt", "China")
Inspired by this minuteearth video. Rather than plotting the cumulative number of confirmed cases against time, this visualization plots the number of new cases against the cumulative number of confirmed cases, with both axes on a log scale, which is more intuitive. Comparisons between countries with very different numbers of cases become very clear, and it is easy to detect whether things are getting better.
# Smooth new cases as a function of cumulative cases, separately per country:
# fit a loess curve (span 0.4) of new_cases ~ n_cases and keep its fitted
# values alongside the original rows.
# nest()/unnest() use the explicit `data = ` / `cols = ` forms; the bare
# `nest(-country)` and `unnest()` spellings are deprecated in tidyr >= 1.0
# (the version that introduced pivot_longer(), used earlier in this file).
COVID_confirmed_smoothed <- COVID_confirmed_global_longer %>%
  dplyr::ungroup() %>%
  tidyr::nest(data = -country) %>%
  dplyr::mutate(m = purrr::map(data, loess,
                               formula = new_cases ~ n_cases, span = 0.4),
                fitted = purrr::map(m, `[[`, "fitted"))
# Drop the model objects and expand back to one row per country and date.
COVID_confirmed_smoothed <- COVID_confirmed_smoothed %>%
  dplyr::select(-m) %>%
  tidyr::unnest(cols = c(data, fitted))
COVID_confirmed_smoothed2 <- COVID_confirmed_smoothed %>%
  dplyr::filter(country %in% c("US", "China", "Italy", "Korea, South", "Iran", "Egypt", "Spain", "Germany", "France", "United Kingdom", "Canada"))
# Log-log trajectory plot: total confirmed cases on x, smoothed new cases on y,
# with each country labelled at both ends of its path.
ggplot(data = COVID_confirmed_smoothed2, aes(n_cases, fitted)) +
  geom_path(aes(color = country, group = country)) +
  theme_bw() +
  scale_y_log10(labels = function(x) format(x, scientific = FALSE)) +
  scale_x_log10(labels = function(x) format(x, scientific = FALSE)) +
  geom_dl(aes(label = country),
          method = list(dl.combine("first.points", "last.points"), cex = 0.8)) +
  xlab(label = "Total confirmed cases") +
  # the earlier ylab("number of cases") was overridden by this label, so only
  # this one is kept
  ylab(label = "number of new cases") +
  theme(legend.position = "none")